{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CVR预估基线版本"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.1 基于AD统计的版本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\"\"\"\n",
    "baseline 1: history pCVR of creativeID/adID/camgaignID/advertiserID/appID/appPlatform\n",
    "\"\"\"\n",
    "\n",
    "import zipfile\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "# load data\n",
    "# Forward slashes keep the Windows path free of backslash escape sequences;\n",
    "# they are accepted as path separators on Windows as well.\n",
    "data_root = \"E:/dataset/pre\"\n",
    "dfTrain = pd.read_csv(\"%s/train.csv\" % data_root)\n",
    "dfTest = pd.read_csv(\"%s/test.csv\" % data_root)\n",
    "dfAd = pd.read_csv(\"%s/ad.csv\" % data_root)\n",
    "\n",
    "# process data: attach the ad.csv columns to every sample via creativeID\n",
    "dfTrain = pd.merge(dfTrain, dfAd, on=\"creativeID\")\n",
    "dfTest = pd.merge(dfTest, dfAd, on=\"creativeID\")\n",
    "y_train = dfTrain[\"label\"].values\n",
    "\n",
    "# model building: the prediction is the mean training label of each key value\n",
    "key = \"appID\"\n",
    "dfCvr = dfTrain.groupby(key)[\"label\"].mean().reset_index()\n",
    "dfCvr.columns = [key, \"avg_cvr\"]\n",
    "dfTest = pd.merge(dfTest, dfCvr, how=\"left\", on=key)\n",
    "# Key values that never occur in the training data have no average after the\n",
    "# left merge; they fall back to the overall mean label. The result is assigned\n",
    "# back because a chained inplace fillna may act on a temporary copy and leave\n",
    "# dfTest unchanged.\n",
    "dfTest[\"avg_cvr\"] = dfTest[\"avg_cvr\"].fillna(np.mean(dfTrain[\"label\"]))\n",
    "proba_test = dfTest[\"avg_cvr\"].values\n",
    "\n",
    "# submission: one row per instanceID, sorted by instanceID, zipped for upload\n",
    "df = pd.DataFrame({\"instanceID\": dfTest[\"instanceID\"].values, \"proba\": proba_test})\n",
    "df = df.sort_values(\"instanceID\")\n",
    "df.to_csv(\"submission.csv\", index=False)\n",
    "with zipfile.ZipFile(\"submission.zip\", \"w\") as fout:\n",
    "    fout.write(\"submission.csv\", compress_type=zipfile.ZIP_DEFLATED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 得分\n",
    "| Submission | 描述| 初赛A | 初赛B | 决赛A | 决赛B |\n",
    "| :------- | :-------: | :-------: | :-------: | :-------: | :-------: |\n",
    "| baseline 2.1 | ad 统计 | 0.10988 | - | - | - |"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 AD+LR版本"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "\"\"\"\n",
    "baseline 2: ad.csv (creativeID/adID/camgaignID/advertiserID/appID/appPlatform) + lr\n",
    "\"\"\"\n",
    "\n",
    "import zipfile\n",
    "import pandas as pd\n",
    "from scipy import sparse\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "# load data\n",
    "data_root = \"./data\"\n",
    "dfTrain = pd.read_csv(\"%s/train.csv\" % data_root)\n",
    "dfTest = pd.read_csv(\"%s/test.csv\" % data_root)\n",
    "dfAd = pd.read_csv(\"%s/ad.csv\" % data_root)\n",
    "\n",
    "# process data: attach the ad.csv columns to every sample via creativeID\n",
    "dfTrain = pd.merge(dfTrain, dfAd, on=\"creativeID\")\n",
    "dfTest = pd.merge(dfTest, dfAd, on=\"creativeID\")\n",
    "y_train = dfTrain[\"label\"].values\n",
    "\n",
    "# feature engineering/encoding: one-hot encode every ID column separately.\n",
    "# handle_unknown=\"ignore\" makes transform() encode values that were not seen\n",
    "# during fit as all-zero rows instead of raising an error.\n",
    "enc = OneHotEncoder(handle_unknown=\"ignore\")\n",
    "feats = [\"creativeID\", \"adID\", \"camgaignID\", \"advertiserID\", \"appID\", \"appPlatform\"]\n",
    "train_blocks, test_blocks = [], []\n",
    "for feat in feats:\n",
    "    # The encoder is re-fitted on the training values of each column and then\n",
    "    # applied to the matching test column.\n",
    "    x_train = enc.fit_transform(dfTrain[feat].values.reshape(-1, 1))\n",
    "    x_test = enc.transform(dfTest[feat].values.reshape(-1, 1))\n",
    "    train_blocks.append(x_train)\n",
    "    test_blocks.append(x_test)\n",
    "# Stack all per-column blocks once instead of re-stacking on every iteration.\n",
    "X_train = sparse.hstack(train_blocks, format=\"csr\")\n",
    "X_test = sparse.hstack(test_blocks, format=\"csr\")\n",
    "\n",
    "# model training\n",
    "lr = LogisticRegression()\n",
    "lr.fit(X_train, y_train)\n",
    "# Column 1 of predict_proba belongs to the second entry of lr.classes_.\n",
    "proba_test = lr.predict_proba(X_test)[:, 1]\n",
    "\n",
    "# submission: one row per instanceID, sorted by instanceID, zipped for upload\n",
    "df = pd.DataFrame({\"instanceID\": dfTest[\"instanceID\"].values, \"proba\": proba_test})\n",
    "df = df.sort_values(\"instanceID\")\n",
    "df.to_csv(\"submission.csv\", index=False)\n",
    "with zipfile.ZipFile(\"submission.zip\", \"w\") as fout:\n",
    "    fout.write(\"submission.csv\", compress_type=zipfile.ZIP_DEFLATED)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 得分\n",
    "| Submission | 描述| 初赛A | 初赛B | 决赛A | 决赛B |\n",
    "| :------- | :-------: | :-------: | :-------: | :-------: | :-------: |\n",
    "| baseline 2.2 | ad + lr | 0.10743 | - | - | - |"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
